In [1]:
#!gsutil -m cp gs://terra-featured-workspaces/Sc_SnRNA-seq/data/mouse_cortex/raw_feature_bc_matrix.h5 ./mouse/
#!gsutil -m cp gs://terra-featured-workspaces/Sc_SnRNA-seq/data/mouse_cortex_ADT/mouse_cortex_ADT.csv ./mouse/
In [2]:
#!pegasus demuxEM -p 8 --min-num-genes 200 --generate-diagnostic-plots ./mouse/mouse_cortex_ADT.csv ./mouse/raw_feature_bc_matrix.h5 ./mouse/exp
In [1]:
import pegasus as pg
import numpy as np

Preprocessing

In [2]:
adata = pg.read_input("./mouse/exp_demux.h5sc", select_singlets = True)
adata
2019-09-04 13:08:12,071 - sccloud - INFO - Read input is finished. Time spent = 1.15s.
Out[2]:
AnnData object with n_obs × n_vars = 3267 × 27998 
    obs: 'assignment', 'Channel'
    var: 'gene_ids'
    uns: 'genome'
In [3]:
adata.obs['assignment'].value_counts()
Out[3]:
Sample5M    502
Sample8M    467
Sample6M    408
Sample1F    397
Sample2F    397
Sample3F    376
Sample7M    367
Sample4F    353
Name: assignment, dtype: int64
In [4]:
pg.qc_metrics(adata, mito_prefix = 'mt-', min_genes = 200)
In [5]:
adata.var.keys()
Out[5]:
Index(['gene_ids', 'n_cells', 'percent_cells', 'robust',
       'highly_variable_features'],
      dtype='object')
In [6]:
adata.obs.keys()
Out[6]:
Index(['assignment', 'Channel', 'passed_qc', 'n_genes', 'n_counts',
       'percent_mito'],
      dtype='object')
In [7]:
pg.violin(adata, keys = ['n_genes', 'n_counts', 'percent_mito'], by = 'passed_qc')
Out[7]:
In [8]:
pg.scatter_matrix(adata, ['n_genes', 'n_counts', 'percent_mito'], color='passed_qc')
Out[8]:
In [9]:
pg.violin(adata, keys = ['n_cells'])
Out[9]:
In [10]:
pg.filter_data(adata)
adata
2019-09-04 13:09:48,023 - sccloud - INFO - After filteration, 3219 cells and 20353 genes are kept. Among 20353 genes, 18870 genes are robust.
Out[10]:
AnnData object with n_obs × n_vars = 3219 × 20353 
    obs: 'assignment', 'Channel', 'passed_qc', 'n_genes', 'n_counts', 'percent_mito'
    var: 'gene_ids', 'n_cells', 'percent_cells', 'robust', 'highly_variable_features'
    uns: 'genome'
In [11]:
pg.log_norm(adata, norm_count = 1e5)
2019-09-04 13:09:51,458 - sccloud - INFO - Normalization is finished. Time spent = 0.15s.
In [12]:
pg.highly_variable_features(adata, consider_batch = False)
2019-09-04 13:09:54,349 - sccloud - INFO - 2000 highly variable features have been selected. Time spent = 0.11s.
In [13]:
pg.variable_feature_plot(adata)
Out[13]:
In [14]:
pg.pca(adata)
2019-09-04 13:10:00,770 - sccloud - INFO - PCA is done. Time spent = 0.17s.
In [15]:
pg.neighbors(adata)
2019-09-04 13:10:02,714 - sccloud - INFO - Nearest neighbor search is finished in 0.36s.
2019-09-04 13:10:02,804 - sccloud - INFO - Constructing affinity matrix is done. Time spent = 0.09s.
2019-09-04 13:10:02,805 - sccloud - INFO - Affinity matrix calculation is finished in 0.09s

Clustering

In [16]:
pg.leiden(adata, n_iter = 2)
2019-09-04 13:10:10,453 - sccloud - INFO - Graph is constructed. Time spent = 0.14s.
2019-09-04 13:10:10,796 - sccloud - INFO - Leiden clustering is done. Time spent = 0.48s.
In [17]:
pg.composition_plot(adata, by = 'leiden_labels', condition = 'assignment')
Out[17]:

Dimension Reduction

In [18]:
pg.fitsne(adata)
2019-09-04 13:10:46,175 - sccloud - INFO - FIt-SNE is calculated. Time spent = 24.94s.
In [19]:
pg.embedding(adata, basis = 'fitsne', keys = ['leiden_labels', 'assignment'])
Out[19]:
In [20]:
pg.umap(adata)
UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
     learning_rate=1.0, local_connectivity=1.0, metric='euclidean',
     metric_kwds=None, min_dist=0.5, n_components=2, n_epochs=None,
     n_neighbors=15, negative_sample_rate=5, random_state=0,
     repulsion_strength=1.0, set_op_mix_ratio=1.0, spread=1.0,
     target_metric='categorical', target_metric_kwds=None,
     target_n_neighbors=-1, target_weight=0.5, transform_queue_size=4.0,
     transform_seed=42, verbose=True)
Construct fuzzy simplicial set
Wed Sep  4 13:11:37 2019 Finding Nearest Neighbors
Wed Sep  4 13:11:37 2019 Finished Nearest Neighbor Search
Wed Sep  4 13:11:39 2019 Construct embedding
	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Wed Sep  4 13:11:44 2019 Finished embedding
2019-09-04 13:11:44,761 - sccloud - INFO - using umap kNN graph 3219
2019-09-04 13:11:44,762 - sccloud - INFO - UMAP is calculated. Time spent = 8.01s.
In [21]:
pg.embedding(adata, basis = 'umap', keys = ['leiden_labels', 'assignment'])
Out[21]:
In [22]:
pg.diffmap(adata)
pg.fle(adata)
2019-09-04 13:11:56,602 - sccloud - INFO - Calculating connected components is done.
2019-09-04 13:11:56,621 - sccloud - INFO - Calculating normalized affinity matrix is done.
2019-09-04 13:11:56,909 - sccloud - INFO - diffmap finished. Time spent = 0.31s.
2019-09-04 13:11:57,184 - sccloud - INFO - Nearest neighbor search is finished in 0.27s.
2019-09-04 13:11:57,245 - sccloud - INFO - Constructing affinity matrix is done. Time spent = 0.06s.
2019-09-04 13:11:57,246 - sccloud - INFO - Affinity matrix calculation is finished in 0.06s
2019-09-04 13:11:57,293 - sccloud - INFO - Graph is constructed. Time spent = 0.05s.
/var/folders/4m/htf4fn9n6qzfpdj1g6q5gnyw0000gn/T/tmp1_7q5qiu.net is written.
['java', '-Djava.awt.headless=true', '-Xmx8g', '-cp', '/Users/yy939/mgh/miniconda3/envs/scCloud-test/lib/python3.7/site-packages/forceatlas2/ext/forceatlas2.jar:/Users/yy939/mgh/miniconda3/envs/scCloud-test/lib/python3.7/site-packages/forceatlas2/ext/gephi-toolkit-0.9.2-all.jar', 'kco.forceatlas2.Main', '--input', '/var/folders/4m/htf4fn9n6qzfpdj1g6q5gnyw0000gn/T/tmp1_7q5qiu.net', '--output', '/var/folders/4m/htf4fn9n6qzfpdj1g6q5gnyw0000gn/T/tmp1_7q5qiu.coords', '--nthreads', '12', '--seed', '0', '--targetChangePerNode', '2.0', '--targetSteps', '5000', '--2d']
Force-directed layout is generated.
2019-09-04 13:12:02,711 - sccloud - INFO - Force-directed layout is calculated. Time spent = 5.80s.
In [23]:
pg.embedding(adata, basis = 'fle', keys = ['leiden_labels', 'assignment'])
Out[23]:

Differential Expression Analysis

In [25]:
pg.de_analysis(adata, cluster = 'leiden_labels', fisher = True, mwu = True)
2019-09-04 13:13:00,812 - sccloud - INFO - Collecting basic statistics is done. Time spent = 8.37s.
2019-09-04 13:13:00,967 - sccloud - INFO - Converting X to csc_matrix is done. Time spent = 0.15s.
2019-09-04 13:13:29,059 - sccloud - INFO - AUROC values are calculated. Time spent = 28.09s.
2019-09-04 13:13:29,378 - sccloud - INFO - Welch's t-test is done. Time spent = 0.32s.
2019-09-04 13:13:30,260 - sccloud - INFO - Fisher's exact test is done. Time spent = 0.88s.
2019-09-04 13:13:52,936 - sccloud - INFO - Mann-Whitney U test is done. Time spent = 22.68s.
2019-09-04 13:13:53,101 - sccloud - INFO - Differential expression analysis is finished. Time spent = 60.68s.

Show a volcano plot for clusters 1 and 2.

In [26]:
pg.volcano(adata, cluster_ids=['1', '2'])
Out[26]:

Find Markers Using Gradient Boosting

In [27]:
markers = pg.find_markers(adata, label_attr = 'leiden_labels')
[1]	valid_0's multi_error: 0.0938902	valid_1's multi_error: 0.236025
Training until validation scores don't improve for 1 rounds.
[2]	valid_0's multi_error: 0.0552295	valid_1's multi_error: 0.18323
[3]	valid_0's multi_error: 0.0434933	valid_1's multi_error: 0.15528
[4]	valid_0's multi_error: 0.0303763	valid_1's multi_error: 0.136646
[5]	valid_0's multi_error: 0.0276148	valid_1's multi_error: 0.124224
[6]	valid_0's multi_error: 0.0210563	valid_1's multi_error: 0.130435
Early stopping, best iteration is:
[5]	valid_0's multi_error: 0.0276148	valid_1's multi_error: 0.124224
2019-09-04 13:14:35,106 - sccloud - INFO - LightGBM used 16.22s to train.
2019-09-04 13:14:40,040 - sccloud - INFO - find_markers took 21.23s to finish.
In [28]:
len(markers)
Out[28]:
13

Annotating Clusters

In [29]:
inferred_cell_type_results = pg.infer_cell_types(adata, markers = 'mouse_brain', de_test = 't')

Rename clusters using the inferred cell types

In [30]:
new_cluster_names = pg.infer_cluster_names(inferred_cell_type_results)
adata.rename_categories('leiden_labels', new_cluster_names)
In [31]:
pg.embedding(adata, basis='fitsne', keys=['leiden_labels'], labels_on_data=True, padding=(0.1, 0.01), width=600)
Out[31]:

Save Results

In [33]:
pg.write_output(adata, 'mouse_cortex_scc.h5ad')
2019-08-28 12:18:11,155 - sccloud - INFO - Write output is finished. Time spent = 1.50s.